This R Markdown document is used to analysis Differentially Expressed Genes from GSE42589.
0.1 Step0-Packages Preparation
This step will check and install all Packages included in this analysis flow.
#source("step0-install.R")0.2 Step1-Download
This step will download and make a quick check of GSE data.
0.2.1 download data
Load library
library(AnnoProbe)
library(GEOquery) Data will be downloaded by geoChina
gset <- geoChina("GSE42589")## you can also use getGEO from GEOquery, by
## getGEO('GSE42589', destdir=".", AnnotGPL = F, getGPL = F)
a=gset[[1]]
dat=exprs(a) #a现在是一个对象,取a这个对象通过看说明书知道要用exprs这个函数获取并检查GPL信息
gpl = a@annotation
checkGPL(gpl)## [1] TRUE
看一下dat这个矩阵的维度
dim(dat)## [1] 33297 13
0.2.2 Check Data
0.2.2.1 Table
查看dat这个矩阵的1至4行和1至4列,逗号前为行,逗号后为列
dat[1:4,1:4]## GSM1045517 GSM1045518 GSM1045519 GSM1045520
## 7892501 5.413030 5.232056 5.590446 6.818197
## 7892502 3.771905 4.830217 3.867787 3.593544
## 7892503 4.395819 5.032963 4.819382 3.010962
## 7892504 9.622141 9.394719 9.363083 9.291167
0.2.3 Normalization
图中数据还算整齐. 查看soft文件33440行 Figure 2 和 文章(Kobayashi et al. 2013) 的Methods部分 可以看到已经用RMA(Robust Multi-array Average)方法进行标准化.Figure 2: soft文件记录的上游分析
所以应该可以不再用limma标准化了. 但是我想看看有啥不一样还是跑一下看看 Figure 3.
par(mfrow=c(1,2))
boxplot(dat,las=2, main="Before")
dat=normalizeBetweenArrays(dat)
boxplot(dat,las=2, main="After")Figure 3: quick boxplot check after limma normalized
看上去结果几乎是一样的,整齐了一点点而已
0.2.4 Clinical Information
使用 pData 获取临床数据
pd=pData(a)
DT::datatable(pd[,c("title","condition:ch1","gender:ch1")])挑选一些感兴趣的临床表型 分组
library(stringr)
group_list=ifelse(grepl('control',pd$title),'control','NSCLP')
table(group_list)## group_list
## control NSCLP
## 6 7
0.2.5 Probes Annotations
获取探针信息
#getGPLAnno returns probe annotations for input gpl
probe2gene=idmap(gpl,type='soft')
head(probe2gene)## ID symbol
## 1 7896736 <NA>
## 2 7896738 OR4G2P
## 3 7896740 OR4F4
## 4 7896742 LOC728323
## 5 7896744 OR4F29
## 6 7896746 MT-TM
筛选数据对应的探针
colnames(probe2gene)=c('probe_id','symbol')
probe2gene=probe2gene[probe2gene$symbol != '',]
ids = probe2gene
ids=ids[ids$probe_id %in% rownames(dat),]
dim(dat)## [1] 33297 13
dat=dat[as.character(ids$probe_id),]
dim(dat)## [1] 25293 13
筛选数据中值高的
ids$median=apply(dat,1,median) #ids新建median这一列,列名为median,同时对dat这个矩阵按行操作,取每一行的中位数,将结果给到median这一列的每一行
ids=ids[order(ids$symbol,ids$median,decreasing = T),]#对ids$symbol按照ids$median中位数从大到小排列的顺序排序,将对应的行赋值为一个新的ids
ids=ids[!duplicated(ids$symbol),]#将symbol这一列取取出重复项,'!'为否,即取出不重复的项,去除重复的gene ,保留每个基因最大表达量结果s
dat=dat[as.character(ids$probe_id),] #新的ids取出probe_id这一列,将dat按照取出的这一列中的每一行组成一个新的dat
rownames(dat)=ids$symbol#把ids的symbol这一列中的每一行给dat作为dat的行名
dim(dat)## [1] 23307 13
dat[1:4,1:4] #保留每个基因ID第一次出现的信息## GSM1045517 GSM1045518 GSM1045519 GSM1045520
## ZZZ3 8.410917 8.709046 8.515850 8.203891
## ZZEF1 7.584177 7.935231 7.295273 7.044309
## ZYX 11.880791 11.455879 11.976036 11.144108
## ZYG11B 8.738334 8.592477 8.736648 8.828810
0.2.6 Save results
保存这一步的结果供后使用
save(dat,group_list,file = 'step1-output.Rdata')0.3 step2-check
略
0.4 step3-DEG
这一步是差异表达基因分析 ### Preparation
载入上一步的结果
load(file = 'step1-output.Rdata')通过为每个数据集绘制箱型图,比较数据集中的数据分布 按照group_list分组画箱线图
boxplot(dat[1,]~group_list)Figure 4: boxplot with group
定义一个画图函数, 避免重复
bp=function(g){ #定义一个函数g,函数为{}里的内容
library(ggpubr)
df=data.frame(gene=g,stage=group_list)
p <- ggboxplot(df, x = "stage", y = "gene",
color = "stage", palette = "jco",
add = "jitter")
# Add p-value
p + stat_compare_means()
}
bp(dat[1,])## Loading required package: ggplot2
Figure 5: boxplot with group
分组信息转成因子格式
group_list = factor( group_list ,levels = c("control","NSCLP"))0.4.1 DEG
0.4.1.1 Design
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
head(design)## control NSCLP
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 1
## 6 0 1
exprSet=dat
rownames(design)=colnames(exprSet)
head(design)## control NSCLP
## GSM1045517 0 1
## GSM1045518 0 1
## GSM1045519 0 1
## GSM1045520 0 1
## GSM1045521 0 1
## GSM1045522 0 1
contrast.matrix<-makeContrasts("control-NSCLP",levels = design)
contrast.matrix ##这个矩阵声明,我们要把 NSCL/P 组跟 control 进行差异分析比较## Contrasts
## Levels control-NSCLP
## control 1
## NSCLP -1
0.4.1.2 DEG function
deg = function(exprSet,design,contrast.matrix){
##step1
fit <- lmFit(exprSet,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix)
##这一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)
return(nrDEG)
}0.4.1.3 access result and save
deg = deg(exprSet,design,contrast.matrix)
head(deg)## logFC AveExpr t P.Value adj.P.Val B
## ESM1 5.309799 7.273859 17.35499 2.597893e-11 6.054909e-07 15.30372
## RFC4 1.899358 7.636704 13.20487 1.215485e-09 1.416465e-05 12.14203
## MIR31HG 2.073896 6.040960 11.57443 7.379672e-09 3.308643e-05 10.54761
## PAPD5 1.375918 6.605216 11.44653 8.575446e-09 3.308643e-05 10.41218
## GSTCD 1.155074 6.605142 11.37932 9.284674e-09 3.308643e-05 10.34037
## SMAD3 -1.882671 9.766203 -11.14164 1.233489e-08 3.308643e-05 10.08279
save(deg,file = 'deg.Rdata')0.4.2 Plot volcano
0.4.2.1 Import data
load(file = 'deg.Rdata')
head(deg)## logFC AveExpr t P.Value adj.P.Val B
## ESM1 5.309799 7.273859 17.35499 2.597893e-11 6.054909e-07 15.30372
## RFC4 1.899358 7.636704 13.20487 1.215485e-09 1.416465e-05 12.14203
## MIR31HG 2.073896 6.040960 11.57443 7.379672e-09 3.308643e-05 10.54761
## PAPD5 1.375918 6.605216 11.44653 8.575446e-09 3.308643e-05 10.41218
## GSTCD 1.155074 6.605142 11.37932 9.284674e-09 3.308643e-05 10.34037
## SMAD3 -1.882671 9.766203 -11.14164 1.233489e-08 3.308643e-05 10.08279
bp(dat[rownames(deg)[1],])Figure 6: boxplot with group
0.4.2.2 Plot and save
nrDEG=deg
head(nrDEG)## logFC AveExpr t P.Value adj.P.Val B
## ESM1 5.309799 7.273859 17.35499 2.597893e-11 6.054909e-07 15.30372
## RFC4 1.899358 7.636704 13.20487 1.215485e-09 1.416465e-05 12.14203
## MIR31HG 2.073896 6.040960 11.57443 7.379672e-09 3.308643e-05 10.54761
## PAPD5 1.375918 6.605216 11.44653 8.575446e-09 3.308643e-05 10.41218
## GSTCD 1.155074 6.605142 11.37932 9.284674e-09 3.308643e-05 10.34037
## SMAD3 -1.882671 9.766203 -11.14164 1.233489e-08 3.308643e-05 10.08279
attach(nrDEG)
#plot(logFC,-log10(P.Value))
library(ggpubr)
df=nrDEG
df$v= -log10(P.Value) #df新增加一列'v',值为-log10(P.Value)
#ggscatter(df, x = "logFC", y = "v",size=0.5)
df$g=ifelse(df$P.Value>0.009,'stable', #if 判断:如果这一基因的P.Value>0.01,则为stable基因
ifelse( df$logFC >1.129,'up', #接上句else 否则:接下来开始判断那些P.Value<0.01的基因,再if 判断:如果logFC >1.5,则为up(上调)基因
ifelse( df$logFC < -1,'down','stable') )#接上句else 否则:接下来开始判断那些logFC <1.5 的基因,再if 判断:如果logFC <1.5,则为down(下调)基因,否则为stable基因
)
table(df$g)##
## down stable up
## 161 22695 451
这个上下调的数量和原文存在差异
图中是原文(Kobayashi et al. 2013)中的上下调数量和原文(Kobayashi et al. 2013)Table_S1中的阈值Figure 7
Figure 7: 原文中差异基因分析的结果
0.4.2.3 火山图
df$name=rownames(df)
head(df)## logFC AveExpr t P.Value adj.P.Val B
## ESM1 5.309799 7.273859 17.35499 2.597893e-11 6.054909e-07 15.30372
## RFC4 1.899358 7.636704 13.20487 1.215485e-09 1.416465e-05 12.14203
## MIR31HG 2.073896 6.040960 11.57443 7.379672e-09 3.308643e-05 10.54761
## PAPD5 1.375918 6.605216 11.44653 8.575446e-09 3.308643e-05 10.41218
## GSTCD 1.155074 6.605142 11.37932 9.284674e-09 3.308643e-05 10.34037
## SMAD3 -1.882671 9.766203 -11.14164 1.233489e-08 3.308643e-05 10.08279
## v g name
## ESM1 10.585379 up ESM1
## RFC4 8.915250 up RFC4
## MIR31HG 8.131963 up MIR31HG
## PAPD5 8.066743 up PAPD5
## GSTCD 8.032233 up GSTCD
## SMAD3 7.908865 down SMAD3
#ggscatter(df, x = "logFC", y = "v",size=0.5,color = 'g')
ggscatter(df, x = "logFC", y = "v", color = "g",size = 0.5,
label = "name", repel = T,
label.select = rownames(df)[df$g != 'stable'] ,
#label.select = c('TTC9', 'AQP3', 'CXCL11','PTGS2'), #挑选一些基因在图中显示出来
palette = c("#00AFBB", "#E7B800", "#FC4E07") )## Warning: ggrepel: 609 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave('volcano.png')## Saving 4 x 3 in image
## Warning: ggrepel: 609 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#ggscatter(df, x = "AveExpr", y = "logFC",size = 0.2)
df$p_c = ifelse(df$P.Value<0.001,'p<0.001',
ifelse(df$P.Value<0.01,'0.001<p<0.01','p>0.01'))
table(df$p_c )##
## 0.001<p<0.01 p<0.001 p>0.01
## 3028 2682 17597
ggscatter(df,x = "AveExpr", y = "logFC", color = "p_c",size=0.2,
palette = c("green", "red", "black") )ggsave('MA.png')## Saving 4 x 3 in image
0.4.3 Plot Heatmap
load(file = 'step1-output.Rdata')
# 每次都要检测数据
dat[1:4,1:4]## GSM1045517 GSM1045518 GSM1045519 GSM1045520
## ZZZ3 8.410917 8.709046 8.515850 8.203891
## ZZEF1 7.584177 7.935231 7.295273 7.044309
## ZYX 11.880791 11.455879 11.976036 11.144108
## ZYG11B 8.738334 8.592477 8.736648 8.828810
table(group_list)## group_list
## control NSCLP
## 6 7
x=deg$logFC #deg取logFC这列并将其重新赋值给x
names(x)=rownames(deg) #deg取probe_id这列,并将其作为名字给x
cg=c(names(head(sort(x),10)),#对x进行从小到大排列,取前100及后100,并取其对应的探针名,作为向量赋值给cg
names(tail(sort(x),10)))
library(pheatmap)
#pheatmap(dat[cg,],show_colnames =F,show_rownames = F) #对dat按照cg取行,所得到的矩阵来画热图
n=t(scale(t(dat[cg,])))#通过“scale”对log-ratio数值进行归一化,现在的dat是行名为探针,列名为样本名,由于scale这个函数应用在不同组数据间存在差异时,需要行名为样本,因此需要用t(dat[cg,])来转换,最后再转换回来
n[n>2]=2
n[n< -2]= -2
n[1:4,1:4]## GSM1045517 GSM1045518 GSM1045519 GSM1045520
## CCL2 1.4315375 -1.1854341 1.7125309 0.7999939
## CLDN11 -0.6073100 1.5482222 1.3147668 1.1732090
## DHRS3 0.2253149 0.6391006 0.5073107 1.8124901
## C10orf10 1.4465775 -0.6861093 0.7594651 0.8309920
#pheatmap(n,show_colnames =F,show_rownames = F)
ac=data.frame(group=group_list)
rownames(ac)=colnames(n) #将ac的行名也就分组信息(是‘no TNBC’还是‘TNBC’)给到n的列名,即热图中位于上方的分组信息
pheatmap(n,show_colnames =T,
show_rownames = T,
cluster_cols = T,
annotation_col=ac,filename = 'heatmap_top20_DEG.png') #列名注释信息为ac即分组信息保存这一步的结果供后使用
write.csv(deg,file = 'deg.csv')0.5 step4-anno-go-kegg
这一步是富集分析
0.5.1 Preparation
载入上一步保存的数据
load(file = 'deg.Rdata')
head(deg)## logFC AveExpr t P.Value adj.P.Val B
## ESM1 5.309799 7.273859 17.35499 2.597893e-11 6.054909e-07 15.30372
## RFC4 1.899358 7.636704 13.20487 1.215485e-09 1.416465e-05 12.14203
## MIR31HG 2.073896 6.040960 11.57443 7.379672e-09 3.308643e-05 10.54761
## PAPD5 1.375918 6.605216 11.44653 8.575446e-09 3.308643e-05 10.41218
## GSTCD 1.155074 6.605142 11.37932 9.284674e-09 3.308643e-05 10.34037
## SMAD3 -1.882671 9.766203 -11.14164 1.233489e-08 3.308643e-05 10.08279
不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。
#logFC_t=1.5
deg$g=ifelse(deg$P.Value>0.009,'stable',
ifelse( deg$logFC > 1.129,'UP',
ifelse( deg$logFC < -1,'DOWN','stable') )
)
table(deg$g)##
## DOWN stable UP
## 161 22695 451
head(deg)## logFC AveExpr t P.Value adj.P.Val B g
## ESM1 5.309799 7.273859 17.35499 2.597893e-11 6.054909e-07 15.30372 UP
## RFC4 1.899358 7.636704 13.20487 1.215485e-09 1.416465e-05 12.14203 UP
## MIR31HG 2.073896 6.040960 11.57443 7.379672e-09 3.308643e-05 10.54761 UP
## PAPD5 1.375918 6.605216 11.44653 8.575446e-09 3.308643e-05 10.41218 UP
## GSTCD 1.155074 6.605142 11.37932 9.284674e-09 3.308643e-05 10.34037 UP
## SMAD3 -1.882671 9.766203 -11.14164 1.233489e-08 3.308643e-05 10.08279 DOWN
将gene id symbol 转化成 entrezgene ID
deg$symbol=rownames(deg)
df <- bitr(unique(deg$symbol), fromType = "SYMBOL",
toType = c( "ENTREZID"),
OrgDb = org.Hs.eg.db)
head(df)## SYMBOL ENTREZID
## 1 ESM1 11082
## 2 RFC4 5984
## 3 MIR31HG 554202
## 5 GSTCD 79807
## 6 SMAD3 4088
## 7 DSCC1 79075
合并保存数据集
DEG=deg
head(DEG)## logFC AveExpr t P.Value adj.P.Val B g
## ESM1 5.309799 7.273859 17.35499 2.597893e-11 6.054909e-07 15.30372 UP
## RFC4 1.899358 7.636704 13.20487 1.215485e-09 1.416465e-05 12.14203 UP
## MIR31HG 2.073896 6.040960 11.57443 7.379672e-09 3.308643e-05 10.54761 UP
## PAPD5 1.375918 6.605216 11.44653 8.575446e-09 3.308643e-05 10.41218 UP
## GSTCD 1.155074 6.605142 11.37932 9.284674e-09 3.308643e-05 10.34037 UP
## SMAD3 -1.882671 9.766203 -11.14164 1.233489e-08 3.308643e-05 10.08279 DOWN
## symbol
## ESM1 ESM1
## RFC4 RFC4
## MIR31HG MIR31HG
## PAPD5 PAPD5
## GSTCD GSTCD
## SMAD3 SMAD3
DEG=merge(DEG,df,by.y='SYMBOL',by.x='symbol')
head(DEG)## symbol logFC AveExpr t P.Value adj.P.Val B
## 1 A1BG -0.37364855 6.363918 -2.9350299 0.01027419 0.04154418 -3.070312
## 2 A1CF 0.04454577 4.381061 0.5398751 0.59723850 0.71918562 -6.467735
## 3 A2M 0.66246897 6.043943 0.9661091 0.34936825 0.50177014 -6.144715
## 4 A2ML1 -0.16498639 4.243355 -1.4348016 0.17194442 0.30586999 -5.609299
## 5 A3GALT2 -0.13777204 4.615797 -1.0488482 0.31091869 0.46262653 -6.062994
## 6 A4GALT -0.56697877 7.073481 -2.5093687 0.02410901 0.07567794 -3.874387
## g ENTREZID
## 1 stable 1
## 2 stable 29974
## 3 stable 2
## 4 stable 144568
## 5 stable 127550
## 6 stable 53947
save(DEG,file = 'anno_DEG.Rdata')筛选差异数据并保存ENTREZID
gene_up= DEG[DEG$g == 'UP','ENTREZID']
gene_down=DEG[DEG$g == 'DOWN','ENTREZID']
gene_diff=c(gene_up,gene_down)
gene_all=as.character(DEG[ ,'ENTREZID'] )
data(geneList, package="DOSE")
head(geneList)## 4312 8318 10874 55143 55388 991
## 4.572613 4.514594 4.418218 4.144075 3.876258 3.677857
logFC的box plot
par(mfrow=c(1,2))
boxplot(geneList)
boxplot(DEG$logFC)Figure 8: boxplot of Dose and logFC
整合ENTREZID并排序
geneList=DEG$logFC
names(geneList)=DEG$ENTREZID
geneList=sort(geneList,decreasing = T)0.5.2 富集分析
载入富集分析脚本并进行分析
#source('kegg_and_go_up_and_down.R')
#run_kegg(gene_up,gene_down,pro='yonghe')Figure 9: Pathway 富集
0.5.3 GO分析
go <- enrichGO(gene_up, OrgDb = "org.Hs.eg.db", ont="all")作图
barplot(go, split="ONTOLOGY",font.size =10)+
facet_grid(ONTOLOGY~., scale="free") +
scale_x_discrete(labels=function(x) str_wrap(x, width=50))+
ggsave('gene_up_GO_all_barplot.png')## Saving 10 x 6 in image
Figure 10: gene_up_GO_all_barplot
下调的GO分析没有找到结果
god <- enrichGO(gene_down, OrgDb = "org.Hs.eg.db", ont="all")
# barplot(go, split="ONTOLOGY",font.size =10)+
# facet_grid(ONTOLOGY~., scale="free") +
# scale_x_discrete(labels=function(x) str_wrap(x, width=50))+
# ggsave('gene_down_GO_all_barplot.png')
god## #
## # over-representation test
## #
## #...@organism Homo sapiens
## #...@ontology GOALL
## #...@keytype ENTREZID
## #...@gene chr [1:146] "81926" "133" "147" "50808" "221" "55107" "83464" "60370" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05
## #...0 enriched terms found
## #...Citation
## Guangchuang Yu, Li-Gen Wang, Yanyan Han and Qing-Yu He.
## clusterProfiler: an R package for comparing biological themes among
## gene clusters. OMICS: A Journal of Integrative Biology
## 2012, 16(5):284-287
0.5.4 与原文对比
比较一下我的和它的GO分析图Figure 11: 原文(左)和我的(右)GO分析图